1. Problematyka braków danych

1 Wczytanie potrzebnych pakietów

library(VIM)
library(naniar)
library(panelView)
library(ggplot2)
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from upsetty import Upset

2 Przykład 1: dane przekrojowe

Wczytujemy zbiór danych data2-cross_sectional.csv

# Read the cross-sectional dataset and preview its first six rows.
df_cross <- read.csv(file = "../data/data2-cross_sectional.csv")
head(x = df_cross, n = 6L)
            y        x1 x2 x3
1 -2.59353983 -2.309169 NA NA
2 -2.57878220 -1.966617 NA NA
3 -0.34323612 -1.686693 NA NA
4  0.02211028 -1.548753 NA NA
5  0.86592554 -1.265396 NA NA
6 -1.03985622 -1.265061 NA NA

Proste podsumowanie

# Per-column summary statistics; columns with missing values also report NA's.
summary(object = df_cross)
       y                   x1                 x2                x3          
 Min.   :-2.593540   Min.   :-2.30917   Min.   :-1.6179   Min.   :-1.75653  
 1st Qu.:-0.726888   1st Qu.:-0.51570   1st Qu.:-0.7684   1st Qu.:-0.53141  
 Median : 0.080384   Median :-0.04523   Median :-0.2063   Median : 0.03407  
 Mean   : 0.000809   Mean   : 0.06537   Mean   :-0.1191   Mean   :-0.09317  
 3rd Qu.: 0.839176   3rd Qu.: 0.72101   3rd Qu.: 0.4566   3rd Qu.: 0.49989  
 Max.   : 2.275646   Max.   : 2.18733   Max.   : 2.1001   Max.   : 1.59851  
                     NA's   :20         NA's   :24        NA's   :72        

Wizualizacja z pakietem VIM

# Aggregate the missing-value information for every column with VIM::aggr().
vim_result <- VIM::aggr(x = df_cross)

# Tabulate missing counts per variable and per combination of variables.
summary(object = vim_result)

 Missings per variable: 
 Variable Count
        y     0
       x1    20
       x2    24
       x3    72

 Missings in combinations of variables: 
 Combinations Count Percent
      0:0:0:0    20      20
      0:0:0:1    39      39
      0:0:1:0     2       2
      0:0:1:1    19      19
      0:1:0:0     6       6
      0:1:0:1    11      11
      0:1:1:1     3       3

Wizualizacja z pakietem naniar.

# Overview of missing cells in df_cross (naniar).
vis_miss(df_cross)

# Same overview with clustering and sorting by missingness switched on.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
vis_miss(df_cross, cluster = TRUE, sort_miss = TRUE)

# Missingness summarised per variable.
gg_miss_var(df_cross)

# Missingness summarised per combination of variables.
gg_miss_upset(df_cross)

# Plain scatter plot of y against x1; geom_point() drops rows with missing
# values and warns about it.
ggplot(data = df_cross, aes(x = x1, y = y)) + geom_point()
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_point()`).

# Same scatter plot of y against x1, drawn with naniar's geom_miss_point()
# instead of geom_point().
ggplot(data = df_cross, mapping = aes(x = x1, y = y)) +
  geom_miss_point()

# Read the cross-sectional dataset with pandas and preview its first five rows.
df_cross = pd.read_csv(filepath_or_buffer="../data/data2-cross_sectional.csv")
df_cross.head(n=5)
          y        x1  x2  x3
0 -2.593540 -2.309169 NaN NaN
1 -2.578782 -1.966617 NaN NaN
2 -0.343236 -1.686693 NaN NaN
3  0.022110 -1.548753 NaN NaN
4  0.865926 -1.265396 NaN NaN
# Draw missingno's matrix view of df_cross and display the figure.
miss_plot = msno.matrix(df_cross)
plt.show()

# Switch matplotlib's interactive mode off; the returned object is echoed below.
plt.ioff()
<contextlib.ExitStack object at 0x16f218a90>
# Draw missingno's dendrogram for df_cross and display the figure.
miss_dendro = msno.dendrogram(df_cross)
plt.show()

# Pass the boolean missingness mask (True where a value is missing) to
# upsetty's Upset.generate_plot and show the result.
upset = Upset.generate_plot(df_cross.isnull())
upset.show()

3 Przykład 2: dane panelowe

Wczytujemy dane w dwóch formatach: długim (../data/data2-panel_long.csv) i szerokim (../data/data2-panel_wide.csv).

# Read the long-format panel (one row per unit_id and year) and preview it.
df_long <- read.csv(file = "../data/data2-panel_long.csv")
head(x = df_long, n = 6L)
  unit_id year         y         x1         x2
1       1 2015 -3.530126 -2.3622119 -2.6785088
2       1 2016 -3.204725 -2.8099225 -1.0101543
3       1 2017 -3.034548 -2.4910414 -1.8172416
4       1 2018 -2.720240 -1.4998663 -1.9648143
5       1 2019 -2.182205 -1.4482605 -1.6537966
6       1 2020 -1.840414 -0.6895582 -0.9912898
# Read the wide-format panel (one column per variable and year) and preview it.
df_wide <- read.csv(file = "../data/data2-panel_wide.csv")
head(x = df_wide, n = 6L)
  unit_id    y.2015      x1.2015    x2.2015     y.2016    x1.2016    x2.2016
1       1 -3.530126 -2.362211936 -2.6785088 -3.2047254 -2.8099225 -1.0101543
2       2  0.568902 -0.424010177  0.9818217  0.6039925         NA         NA
3       3  1.427478  0.005015081  1.2572458  0.8887155         NA         NA
4       4 -3.477064 -2.502756453 -1.9826823 -2.6826293 -2.1249015 -1.7178874
5       5 -2.007676           NA         NA -1.1877712 -1.8131336 -1.3596098
6       6 -2.059100 -1.190443674 -1.9837228 -1.0404844 -0.9521887 -0.6128572
      y.2017    x1.2017     x2.2017     y.2018    x1.2018    x2.2018     y.2019
1 -3.0345484 -2.4910414 -1.81724159 -2.7202403 -1.4998663 -1.9648143 -2.1822049
2  0.6704031 -0.8117871  0.80569343  1.3155977  0.5093892  0.5064433  1.2432841
3  0.8473622  0.2419629  0.01145462  1.0985260  0.5821369  1.0316695  1.6015851
4 -2.5445519         NA          NA -2.8655684 -1.2009330 -2.1596327 -2.3762645
5 -1.2179854         NA          NA -1.2431829 -0.5268546 -0.0286618 -1.3686251
6 -1.9270956 -1.6184360 -1.53674341 -0.2946865  0.1375292  0.3467936 -0.4073164
     x1.2019    x2.2019      y.2020    x1.2020     x2.2020      y.2021
1 -1.4482605 -1.6537966 -1.84041446 -0.6895582 -0.99128978 -2.25887006
2  0.6202672  0.4817378  1.76210283  1.6164227  1.00273225  1.01640057
3  1.7534910  0.3221606  1.59996491  0.9481215 -0.06525444          NA
4 -1.1665352 -1.4703061 -2.04844719 -0.9363446 -1.51867585 -2.23447897
5 -0.8229503 -0.3181881 -0.86471969  0.1961181 -0.71225250 -0.53040779
6 -0.4208893 -0.1267792  0.04081512 -0.1021241 -0.32718542 -0.04982249
     x1.2021    x2.2021      y.2022    x1.2022     x2.2022       y.2023
1 -1.3144860 -0.9215337 -2.03611946 -1.0736086 -1.82785228 -0.952456617
2  0.5519369  0.3990350  1.56796215  0.9857544  1.19197201  1.879899271
3         NA         NA          NA         NA          NA           NA
4 -1.3350168 -1.8754815 -1.69493361 -0.1758811 -1.23627143 -1.560217747
5  0.3277557 -1.0525771 -0.06009162 -0.9959412  0.39601553  0.004972416
6 -0.1200651  0.2559760 -0.59869442 -0.3174002 -0.08000885 -0.653598980
       x1.2023    x2.2023     y.2024    x1.2024    x2.2024
1  0.006495037 -0.3005491 -1.9426917 -1.1068217 -0.5397062
2           NA         NA  1.6401849  1.3837963  0.5168941
3           NA         NA         NA         NA         NA
4           NA         NA -1.4017691 -0.3189286 -0.7431038
5 -0.378907252 -0.2225839 -0.8976668 -0.3185950  0.1648471
6 -0.233184890 -0.5287449  0.9094604  0.5683409  0.1542975
# Missing-value aggregation for the wide-format panel.
VIM::aggr(x = df_wide)

# Missing-data view for y, indexed by unit_id and year.
panelview(
  formula = y ~ 1,
  data = df_long,
  index = c("unit_id", "year"),
  type = "missing"
)

# Missing-data view for x2, indexed by unit_id and year.
panelview(
  formula = x2 ~ 1,
  data = df_long,
  index = c("unit_id", "year"),
  type = "missing"
)

# Missing-data view with y, x1 and x2 all listed on the right-hand side.
panelview(
  formula = 1 ~ y + x1 + x2,
  data = df_long,
  index = c("unit_id", "year"),
  type = "missing"
)

# Read the long-format panel with pandas and preview its first five rows.
df_long = pd.read_csv(filepath_or_buffer="../data/data2-panel_long.csv")
df_long.head(n=5)
   unit_id  year         y        x1        x2
0        1  2015 -3.530126 -2.362212 -2.678509
1        1  2016 -3.204725 -2.809923 -1.010154
2        1  2017 -3.034548 -2.491041 -1.817242
3        1  2018 -2.720240 -1.499866 -1.964814
4        1  2019 -2.182205 -1.448260 -1.653797
# Read the wide-format panel with pandas and preview its first five rows.
df_wide = pd.read_csv(filepath_or_buffer="../data/data2-panel_wide.csv")
df_wide.head(n=5)
   unit_id    y.2015   x1.2015  ...    y.2024   x1.2024   x2.2024
0        1 -3.530126 -2.362212  ... -1.942692 -1.106822 -0.539706
1        2  0.568902 -0.424010  ...  1.640185  1.383796  0.516894
2        3  1.427478  0.005015  ...       NaN       NaN       NaN
3        4 -3.477064 -2.502756  ... -1.401769 -0.318929 -0.743104
4        5 -2.007676       NaN  ... -0.897667 -0.318595  0.164847

[5 rows x 31 columns]
# Reshape y into a unit_id x year grid and flag the cells that are missing.
missing_pattern = (
    df_long
    .pivot(index='unit_id', columns='year', values='y')
    .isna()
)

# Render the boolean grid as a heatmap and display it.
plt.figure(figsize=(12, 8))
sns.heatmap(data=missing_pattern, cmap='binary', cbar=True)
plt.title('Panel View of Missing Data')
plt.show()